Escape Room Survey

Dependencies

In [16]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(color_codes=True)
from scipy.stats import chi2_contingency, ranksums
from pandas import Series, DataFrame
from scipy import stats

# if matplotlib is not set inline, you will not see plots
#alternatives auto gtk gtk2 inline osx qt qt5 wx tk
#%matplotlib auto
#%matplotlib qt
%matplotlib inline

Import data

In [17]:
# path = "datos.csv"
# df = pd.read_csv(path)
from azureml import Workspace

ws = Workspace()
ds = ws.datasets['ERDatos.csv']
df = ds.to_dataframe()
df = pd.DataFrame(data=df)

# df = df.replace({' \(Muy de acuerdo\)': ''}, regex=True)
# df = df.replace({' \(Muy en desacuerdo\)': ''}, regex=True)
df.head()
Out[17]:
Timestamp Age Sex GeneralOpinion LikeGames EasyIWEB LearningEffectiveness Engagement Difficulty Organisation PreferOverLab LearnMoreThanLab OtherSubjects Recommend Opinion
0 2018/12/17 1:16:49 p. m. CET 22 Mujer 4 5 2 2 4 5 5 5 4 Si No NaN
1 2018/12/17 1:21:04 p. m. CET 45 Hombre 3 5 NS/NC NS/NC 2 4 2 2 NS/NC Si Si NaN
2 2018/12/17 1:21:06 p. m. CET 22 Hombre 4 4 2 3 4 2 2 4 5 Si Si Mejorar la interfaz de usuario, la pantalla de...
3 2018/12/17 1:21:21 p. m. CET 22 Hombre 4 4 3 4 3 4 5 4 4 Si Si Enhorabuena por el trabajo, nos lo hemos pasad...
4 2018/12/17 1:21:21 p. m. CET 20 Mujer 5 3 3 4 5 3 5 5 4 Si Si NaN
In [18]:
df.dtypes
Out[18]:
Timestamp                object
Age                       int64
Sex                      object
GeneralOpinion            int64
LikeGames                 int64
EasyIWEB                 object
LearningEffectiveness    object
Engagement                int64
Difficulty               object
Organisation             object
PreferOverLab            object
LearnMoreThanLab         object
OtherSubjects            object
Recommend                object
Opinion                  object
dtype: object
In [19]:
numList = [
    "Age",
    "LikeGames",
    "EasyIWEB",
    "LearningEffectiveness",
    "Engagement",
    "Difficulty",
    "Organisation",
    "PreferOverLab",
    "LearnMoreThanLab",
]
for str in numList:
    df[str] = df[str].replace("NS/NC", None)
    mean = df[df[str] != None][str].apply(lambda x: float(x)).mean()
    df[str] = df[str].replace("NS/NC", mean)
    df[str] = df[str].apply(lambda x: float(x))
In [20]:
df.describe(include="all").T
Out[20]:
count unique top freq mean std min 25% 50% 75% max
Timestamp 64 57 2018/12/17 1:21:28 p. m. CET 2 NaN NaN NaN NaN NaN NaN NaN
Age 64 NaN NaN NaN 22.0469 3.05728 20 21 21 22 45
Sex 64 2 Hombre 43 NaN NaN NaN NaN NaN NaN NaN
GeneralOpinion 64 NaN NaN NaN 4.28125 0.863157 1 4 4 5 5
LikeGames 64 NaN NaN NaN 4.20312 0.894067 2 4 4 5 5
EasyIWEB 64 NaN NaN NaN 2.79688 0.945829 1 2 3 3 5
LearningEffectiveness 64 NaN NaN NaN 3.35938 1.23914 1 2.75 3.5 4 5
Engagement 64 NaN NaN NaN 4.20312 1.1294 1 4 5 5 5
Difficulty 64 NaN NaN NaN 3.4375 0.888641 1 3 3.5 4 5
Organisation 64 NaN NaN NaN 3.90625 1.01916 1 3 4 5 5
PreferOverLab 64 NaN NaN NaN 4.42188 0.939515 1 4 5 5 5
LearnMoreThanLab 64 NaN NaN NaN 3.70312 1.28087 1 3 4 5 5
OtherSubjects 64 2 Si 60 NaN NaN NaN NaN NaN NaN NaN
Recommend 64 2 Si 56 NaN NaN NaN NaN NaN NaN NaN
Opinion 19 19 Creo que es una gran iniciativa para incentiva... 1 NaN NaN NaN NaN NaN NaN NaN
In [21]:
def checkHypotheses(name, p_val):
    print(name)
    print("   The p value is", p_val)
    if p_val < 0.05 :
        print("   The null hyphotheses is rejected: " +
              "The two samples are statistically different")
    else :
        print("   Failed to reject the null hypotheses: " +
              "The two samples are alike")
    print('**************************************' +
          '**************************************')

Gender bias

In [22]:
st = df.groupby(['GeneralOpinion', 'Sex'])['GeneralOpinion'].count().unstack('Sex').fillna(0)
st.plot(kind='bar', stacked=True)
Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff57ffaeeb8>
In [23]:
st = df.groupby(['LikeGames', 'Sex'])['LikeGames'].count().unstack('Sex').fillna(0)
st.plot(kind='bar', stacked=True)
Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff57fe5ccc0>
In [24]:
st = df.groupby(['Engagement', 'Sex'])['Engagement'].count().unstack('Sex').fillna(0)
st.plot(kind='bar', stacked=True)
Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff5803395f8>
In [25]:
womenLikeGames = df[df['Sex']=="Mujer"]['LikeGames']
menLikeGames = df[df['Sex']=="Hombre"]['LikeGames']
womenLikeER = df[df['Sex']=="Mujer"]['Engagement']
menLikeER = df[df['Sex']=="Hombre"]['Engagement']
womenEasy = df[df['Sex']=="Mujer"]['EasyIWEB']
menEasy = df[df['Sex']=="Hombre"]['EasyIWEB']
womenGeneral = df[df['Sex']=="Mujer"]['GeneralOpinion']
menGeneral = df[df['Sex']=="Hombre"]['GeneralOpinion']
womenKnowledge = df[df['Sex']=="Mujer"]['LearningEffectiveness']
menKnowledge = df[df['Sex']=="Hombre"]['LearningEffectiveness']
womenLevel = df[df['Sex']=="Mujer"]['Difficulty']
menLevel = df[df['Sex']=="Hombre"]['Difficulty']
womenOrganised = df[df['Sex']=="Mujer"]['Organisation']
menOrganised = df[df['Sex']=="Hombre"]['Organisation']
womenPrefer = df[df['Sex']=="Mujer"]['PreferOverLab']
menPrefer = df[df['Sex']=="Hombre"]['PreferOverLab']
womenLearn = df[df['Sex']=="Mujer"]['LearnMoreThanLab']
menLearn = df[df['Sex']=="Hombre"]['LearnMoreThanLab']
womenOther = df[df['Sex']=="Mujer"]['OtherSubjects']
menOther = df[df['Sex']=="Hombre"]['OtherSubjects']
womenRecommend = df[df['Sex']=="Mujer"]['Recommend']
menRecommend = df[df['Sex']=="Hombre"]['Recommend']
In [26]:
womenLikeGames.describe(include="all").T
Out[26]:
count    21.000000
mean      3.523810
std       0.928388
min       2.000000
25%       3.000000
50%       4.000000
75%       4.000000
max       5.000000
Name: LikeGames, dtype: float64
In [27]:
menLikeGames.describe(include="all").T
Out[27]:
count    43.000000
mean      4.534884
std       0.667220
min       3.000000
25%       4.000000
50%       5.000000
75%       5.000000
max       5.000000
Name: LikeGames, dtype: float64
In [28]:
z_stat, p_val = ranksums(menLikeER, womenLikeER) 
checkHypotheses("Engagement",p_val)
z_stat, p_val = ranksums(menLikeGames, womenLikeGames) 
checkHypotheses("Games",p_val)
z_stat, p_val = ranksums(menKnowledge, womenKnowledge) 
checkHypotheses("Learning Effectiveness",p_val)
z_stat, p_val = ranksums(menEasy, womenEasy) 
checkHypotheses("Easy IWEB",p_val)
z_stat, p_val = ranksums(menGeneral, womenGeneral) 
checkHypotheses("General opinion",p_val)
z_stat, p_val = ranksums(menLevel, womenLevel) 
checkHypotheses("Difficulty ER",p_val)
z_stat, p_val = ranksums(menOrganised, womenOrganised) 
checkHypotheses("Organisation",p_val)
z_stat, p_val = ranksums(menPrefer, womenPrefer) 
checkHypotheses("Prefer over lab",p_val)
z_stat, p_val = ranksums(menLearn, womenLearn) 
checkHypotheses("Learning effectiveness over lab",p_val)
z_stat, p_val = ranksums(menRecommend, womenRecommend) 
checkHypotheses("Recommend",p_val)
Engagement
   The p value is 0.348989636302
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Games
   The p value is 0.000119918416005
   The null hyphotheses is rejected: The two samples are statistically different
****************************************************************************
Learning Effectiveness
   The p value is 0.23531662741
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Easy IWEB
   The p value is 0.515316911072
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
General opinion
   The p value is 0.965784868225
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Difficulty ER
   The p value is 0.224224489233
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Organisation
   The p value is 0.0374861296661
   The null hyphotheses is rejected: The two samples are statistically different
****************************************************************************
Prefer over lab
   The p value is 0.224224489233
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Learning effectiveness over lab
   The p value is 0.390942609374
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
Recommend
   The p value is 0.529261722029
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************

More insights

In [29]:
df.corr()
Out[29]:
Age GeneralOpinion LikeGames EasyIWEB LearningEffectiveness Engagement Difficulty Organisation PreferOverLab LearnMoreThanLab
Age 1.000000 -0.167480 0.106795 -0.095461 -0.113455 -0.228055 0.079969 -0.227809 -0.283300 0.060358
GeneralOpinion -0.167480 1.000000 0.048207 0.246072 0.631187 0.836006 0.064669 0.607849 0.458140 0.464361
LikeGames 0.106795 0.048207 1.000000 0.368665 -0.009626 0.052808 0.026222 0.003811 0.047537 -0.071253
EasyIWEB -0.095461 0.246072 0.368665 1.000000 0.428944 0.262128 -0.062557 0.259863 0.080102 0.145966
LearningEffectiveness -0.113455 0.631187 -0.009626 0.428944 1.000000 0.627534 -0.116221 0.479581 0.372176 0.538324
Engagement -0.228055 0.836006 0.052808 0.262128 0.627534 1.000000 0.068205 0.582202 0.621040 0.426386
Difficulty 0.079969 0.064669 0.026222 -0.062557 -0.116221 0.068205 1.000000 0.081059 -0.015447 0.032249
Organisation -0.227809 0.607849 0.003811 0.259863 0.479581 0.582202 0.081059 1.000000 0.356929 0.306645
PreferOverLab -0.283300 0.458140 0.047537 0.080102 0.372176 0.621040 -0.015447 0.356929 1.000000 0.488243
LearnMoreThanLab 0.060358 0.464361 -0.071253 0.145966 0.538324 0.426386 0.032249 0.306645 0.488243 1.000000

Learning Effectiveness vs. Engagement

In [30]:
st = df.groupby(['LearningEffectiveness', 'Engagement'])['LearningEffectiveness'].count().unstack('Engagement').fillna(0)
st.plot(kind='bar', stacked=True)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff57fb94080>
In [31]:
np.corrcoef(df['LearningEffectiveness'], df['Engagement'])
Out[31]:
array([[ 1.      ,  0.627534],
       [ 0.627534,  1.      ]])
In [32]:
THRESHOLD = 2
effective = df['LearningEffectiveness']>THRESHOLD
engaging = df['Engagement']>THRESHOLD
z_stat, p_val = ranksums(effective, engaging) 
checkHypotheses("Are learning effectiveness and engagement equally distributed?",p_val)
Are learning effectiveness and engagement equally distributed?
   The p value is 0.169911841731
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
In [33]:
np.corrcoef(effective, engaging)
Out[33]:
array([[ 1.        ,  0.49136232],
       [ 0.49136232,  1.        ]])
In [34]:
THRESHOLD = 3
effective = df['LearningEffectiveness']>THRESHOLD
engaging = df['Engagement']>THRESHOLD
z_stat, p_val = ranksums(effective, engaging) 
checkHypotheses("Are learning effectiveness and engagement equally distributed?",p_val)
Are learning effectiveness and engagement equally distributed?
   The p value is 0.00952879138632
   The null hyphotheses is rejected: The two samples are statistically different
****************************************************************************
In [35]:
np.corrcoef(effective, engaging)
Out[35]:
array([[ 1.        ,  0.40574111],
       [ 0.40574111,  1.        ]])

Learning Effectiveness vs. EasyIWEB

In [36]:
st = df.groupby(['LearningEffectiveness', 'EasyIWEB'])['LearningEffectiveness'].count().unstack('EasyIWEB').fillna(0)
st.plot(kind='bar', stacked=True)
Out[36]:
<matplotlib.axes._subplots.AxesSubplot at 0x7ff57faa3b38>
In [37]:
np.corrcoef(df['LearningEffectiveness'], df['EasyIWEB'])
Out[37]:
array([[ 1.        ,  0.42894368],
       [ 0.42894368,  1.        ]])
In [38]:
THRESHOLD = 2
effective = df['LearningEffectiveness']>THRESHOLD
easy = df['EasyIWEB']>THRESHOLD
z_stat, p_val = ranksums(effective, easy) 
checkHypotheses("Are learning effectiveness and opinion on IWEB level equally distributed?",p_val)
Are learning effectiveness and opinion on IWEB level equally distributed?
   The p value is 0.222469210665
   Failed to reject the null hypotheses: The two samples are alike
****************************************************************************
In [39]:
THRESHOLD = 3
effective = df['LearningEffectiveness']>THRESHOLD
easy = df['EasyIWEB']>THRESHOLD
z_stat, p_val = ranksums(effective, easy) 
checkHypotheses("Are learning effectiveness and opinion on IWEB level equally distributed?",p_val)
Are learning effectiveness and opinion on IWEB level equally distributed?
   The p value is 0.00605156428839
   The null hyphotheses is rejected: The two samples are statistically different
****************************************************************************